import stopwordsiso as stopwords
import classla
import pandas
import gensim
from gensim.models.phrases import Phraser


# CSV that is read as input, and CSV that the resulting tokens are written to
input_file = "DATA_sputnik_rs_vaccine.csv"
output_file = "TOKENS_sputnik_rs_vaccine.csv"


# Extra entries appended to the stock stopword list:
# empty / whitespace-only strings first, then additional words to drop
added_stopwords = [
    "",
    " ",
    " \n",
    "\n",
    "kod",
    "iza",
    "ovaj",
    "će",
    "reći",
    "moći",
]




# Data import: load the CSV and pull the two text columns out as plain Python lists
df = pandas.read_csv(input_file)

# NOTE(review): pandas loads missing cells as NaN (a float), which the later
# article.split() call cannot handle - confirm "full_text" has no empty cells
headlines, articles = df["title"].tolist(), df["full_text"].tolist()

# No Serbian stopword list was available; the Croatian one is close enough,
# and anything missing can be added through added_stopwords
# stopwords.stopwords() gives back a set, so it is unpacked into a list here
# with the extra entries placed after the stock ones
stopword_list = [*stopwords.stopwords("hr"), *added_stopwords]

# Tokenize up front: classla's automatic tokenization was oddly splitting off the
# trailing accent mark of the character ć
# split() with no argument splits on any run of whitespace - split(" ") was used before
# and caused errors because it produced an incorrect number of words
# classla's pre-tokenized input expects "sentences" that are not too long, and this data
# has no punctuation, so every word is handed over as its own one-word sentence
# That means each word (token) has to be wrapped in its own list (sentence)
# This is not tokenization in the usual sense; with the usual form (a flat list of words)
# the lemmatization below breaks the words down into single characters
# Credit for this line goes to Jonathan Keegan, who kindly helped when I was stuck. He had to
# read the classla source to work out the structure its tokenizer produces, because it was
# not covered in the documentation
# Each word from article.split() is already whitespace-free, so wrapping it in a
# one-element list is the same as splitting it again
tokenized_articles = [[[word] for word in article.split()] for article in articles]

# Lemmatize the Serbian text
# Classla seems like the best Serbian lemmatizer, documentation here: https://github.com/clarinsi/classla#readme
# Information on the overall Stanford NLP here: https://stanfordnlp.github.io/stanfordnlp/lemma.html
# tokenize_pretokenized=True makes the pipeline accept the list-of-lists input built above
nlp = classla.Pipeline("sr", tokenize_pretokenized=True)

# Built once so every lemma is checked against a set instead of scanning the list
stopword_set = set(stopword_list)

lemmatized_articles = []
# enumerate gives the running position directly; looking it up with
# tokenized_articles.index(article) was a linear search per document and reported
# the first match's number whenever two articles tokenized identically
for position, article in enumerate(tokenized_articles, start=1):
    print(f"Lemmatizing document: {position} out of {len(articles)}")
    doc = nlp(article)
    # NOTE(review): the second condition compares the Word object itself against a
    # string, so it appears never to filter anything. It is kept unchanged here; if the
    # goal is to drop stray accent-mark tokens, compare word.text instead - confirm intent
    list_lemmas = [
        word.lemma
        for sent in doc.sentences
        for word in sent.words
        if word.lemma not in stopword_set and word not in ["'token': '́'"]
    ]
    lemmatized_articles.append(list_lemmas)
    print(list_lemmas)

print("Finished lemmatizing")
# Approach taken from: https://stackoverflow.com/questions/46129335/get-bigrams-and-trigrams-in-word2vec-gensim
# Bigram detection needs a list of lists, where each inner list is one text, tokenized and lemmatized
# The documentation calls the texts "sentences", but they do not have to be sentences per se;
# the input only has to be a list of lists of words
# A higher min_count and threshold means fewer phrases.
bigram = gensim.models.phrases.Phrases(lemmatized_articles, min_count=5, threshold=10)
bigram_phraser = Phraser(bigram)
# Run every lemmatized article through the phraser
bigram_articles = [bigram_phraser[lemmas] for lemmas in lemmatized_articles]

print("Finished bigrams")

# Feed the bigram output through phrase detection a second time to get trigrams:
trigram = gensim.models.phrases.Phrases(bigram_articles, min_count=5, threshold=10)
trigram_phraser = Phraser(trigram)
# Run every bigram-merged article through the second phraser
trigram_articles = [trigram_phraser[tokens] for tokens in bigram_articles]

print("Finished trigrams")

# This is the format that the tidymodels topic modeling in R likes:
# one row per token, with "index" holding the position of the article it came from
# enumerate supplies that position directly. The previous trigram_articles.index(article)
# lookup ran once per token and returned the first equal article, so articles with
# identical token lists all ended up under the same index
text_tidy = [
    {"token": token, "index": article_index}
    for article_index, article in enumerate(trigram_articles)
    for token in article
]

df = pandas.DataFrame(text_tidy)
# utf-8-sig writes a byte-order mark at the start of the file
df.to_csv(output_file, encoding="utf-8-sig")


